xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 5c38863b713405b10d72dc79afc7a8589bb6a11c)
1 /*
2  * Copyright 2006-2009, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 
22 #include <KernelExport.h>
23 #include <Select.h>
24 
25 #include <net_buffer.h>
26 #include <net_datalink.h>
27 #include <net_stat.h>
28 #include <NetBufferUtilities.h>
29 #include <NetUtilities.h>
30 
31 #include <lock.h>
32 #include <tracing.h>
33 #include <util/AutoLock.h>
34 #include <util/khash.h>
35 #include <util/list.h>
36 
37 #include "EndpointManager.h"
38 
39 
40 // References:
41 //  - RFC 793 - Transmission Control Protocol
42 //  - RFC 813 - Window and Acknowledgement Strategy in TCP
43 //	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
44 //
45 // Things this implementation currently doesn't implement:
46 //	- TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery,
47 //	  RFC 2001, RFC 2581, RFC 3042
48 //	- NewReno Modification to TCP's Fast Recovery, RFC 2582
49 //	- Explicit Congestion Notification (ECN), RFC 3168
50 //	- SYN-Cache
51 //	- TCP Extensions for High Performance, RFC 1323
52 //	- SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
53 //	- Forward RTO-Recovery, RFC 4138
54 //	- Time-Wait hash instead of keeping sockets alive
55 
// Formats \a address via AddressString for the endpoint's domain and yields
// the resulting string data. Uses Domain(), so it can only be expanded where
// that call is valid (i.e. inside endpoint methods).
#define PrintAddress(address) \
	AddressString(Domain(), address, true).Data()
58 
59 //#define TRACE_TCP
60 //#define PROBE_TCP
61 
// Debug tracing: with TRACE_TCP defined, TRACE() prints the current thread,
// the system time, the endpoint pointer (this) and the name of its state,
// followed by the caller's formatted message, via dprintf(). Without
// TRACE_TCP it expands to an empty statement and its arguments are dropped.
#ifdef TRACE_TCP
// the space before ', ##args' is important in order for this to work with cpp 2.95
#	define TRACE(format, args...)	dprintf("%ld: TCP [%llu] %p (%12s) " \
		format "\n", find_thread(NULL), system_time(), this, \
		name_for_state(fState) , ##args)
#else
#	define TRACE(args...)			do { } while (0)
#endif
70 
// Send-path probing: with PROBE_TCP defined, PROBE() dumps one line with the
// buffer's addresses and size plus the endpoint's send sequence numbers,
// congestion window, slow start threshold, window sizes, send queue fill
// level and retransmit timeout via dprintf(). Without PROBE_TCP it expands
// to an empty statement.
#ifdef PROBE_TCP
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \
		system_time(), PrintAddress(buffer->source), \
		PrintAddress(buffer->destination), buffer->size, (uint32)fSendNext, \
		(uint32)fSendUnacknowledged, fCongestionWindow, fSlowStartThreshold, \
		window, fSendWindow, (uint32)(fSendMax - fSendUnacknowledged), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#else
#	define PROBE(buffer, window)	do { } while (0)
#endif
82 
83 #if TCP_TRACING
84 namespace TCPTracing {
85 
// Trace entry for a received segment. Captures the endpoint, its state at
// the time of the call, the buffer pointer and size, and the segment's
// sequence/acknowledge numbers and flags together with the given window.
class Receive : public AbstractTraceEntry {
public:
	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
			net_buffer* buffer)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fWindow(window),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	// Prints the captured values in a single line.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) receive buffer %p (%lu bytes), flags %x, "
			"seq %lu, ack %lu, wnd %lu", fEndpoint, name_for_state(fState),
			fBuffer, fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
		// only the pointer value is stored; it is printed, never dereferenced
		// by AddDump()
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fWindow;
	tcp_state		fState;
	uint8			fFlags;
};
120 
// Trace entry for a sent segment. Captures the endpoint, its state at the
// time of the call, the buffer pointer and size, the segment's
// sequence/acknowledge numbers and flags, and the given first/last sequence
// numbers.
class Send : public AbstractTraceEntry {
public:
	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
			tcp_sequence firstSequence, tcp_sequence lastSequence)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fFirstSequence(firstSequence.Number()),
		fLastSequence(lastSequence.Number()),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	// Prints the captured values in a single line.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) send buffer %p (%lu bytes), flags %x, "
			"seq %lu, ack %lu, first %lu, last %lu",
			fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags,
			fSequence, fAcknowledge, fFirstSequence, fLastSequence);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
		// only the pointer value is stored; it is printed, never dereferenced
		// by AddDump()
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fFirstSequence;
	uint32			fLastSequence;
	tcp_state		fState;
	uint8			fFlags;
};
158 
// Trace entry for a state change. Records the endpoint and the state it
// reports when the entry is created, so it has to be created after fState
// was updated.
class State : public AbstractTraceEntry {
public:
	State(TCPEndpoint* endpoint)
		:
		fEndpoint(endpoint),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint and the name of the recorded state.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) state change", fEndpoint,
			name_for_state(fState));
	}

protected:
	TCPEndpoint*	fEndpoint;
	tcp_state		fState;
};
179 
// Trace entry recording that a listening endpoint spawned a new endpoint.
// Only the two pointers are stored.
class Spawn : public AbstractTraceEntry {
public:
	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
		:
		fListeningEndpoint(listeningEndpoint),
		fSpawnedEndpoint(spawnedEndpoint)
	{
		Initialized();
	}

	// Prints both endpoint pointers.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
	}

protected:
	TCPEndpoint*	fListeningEndpoint;
	TCPEndpoint*	fSpawnedEndpoint;
};
199 
// Trace entry for an error. Records the endpoint, its state at the time of
// the call, an error description and the source line it was reported from.
// The description pointer is stored as is (the string is not copied).
class Error : public AbstractTraceEntry {
public:
	Error(TCPEndpoint* endpoint, const char* error, int32 line)
		:
		fEndpoint(endpoint),
		fLine(line),
		fError(error),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint, state name, line and error description.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) error at line %ld: %s", fEndpoint,
			name_for_state(fState), fLine, fError);
	}

protected:
	TCPEndpoint*	fEndpoint;
	int32			fLine;
	const char*		fError;
	tcp_state		fState;
};
224 
// Trace entry for a timer event. Records the endpoint, its state at the
// time of the call and a label naming the timer. The label pointer is stored
// as is (the string is not copied).
class Timer : public AbstractTraceEntry {
public:
	Timer(TCPEndpoint* endpoint, const char* which)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint, state name and timer label.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) %s timer", fEndpoint,
			name_for_state(fState), fWhich);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
	tcp_state		fState;
};
247 
248 }	// namespace TCPTracing
249 
// T(x) creates the TCPTracing entry x with a non-throwing new when
// TCP_TRACING is enabled, and expands to nothing otherwise.
#	define T(x)	new(std::nothrow) TCPTracing::x
#else
#	define T(x)
253 #endif	// TCP_TRACING
254 
// Initial estimate for packet round trip time (RTT). Used unscaled as the
// initial retransmit timeout and, divided by kTimestampFactor, as the initial
// round trip time and deviation.
// NOTE(review): presumably in microseconds (system_time() units) - confirm.
#define TCP_INITIAL_RTT		2000000
257 
// constants for the fFlags field
enum {
	// Both option flags are set by default in the constructor.
	FLAG_OPTION_WINDOW_SCALE	= 0x01,
	FLAG_OPTION_TIMESTAMP		= 0x02,
		// checked by _UpdateTimestamps() before recording a timestamp
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 0x04,
		// set by Shutdown() for SHUT_RD and SHUT_RDWR
	FLAG_CLOSED					= 0x08,
		// set by Free()
	FLAG_DELETE_ON_CLOSE		= 0x10,
		// set by _Close(), and by _EnterTimeWait() for local connections
		// that are in TIME_WAIT
	FLAG_LOCAL					= 0x20
		// reported by IsLocal()
};
270 
271 
// Divisor that converts system_time() values into TCP timestamp ticks
// (see tcp_now()).
static const int kTimestampFactor = 1024;
273 
274 
275 static inline bigtime_t
276 absolute_timeout(bigtime_t timeout)
277 {
278 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
279 		return timeout;
280 
281 	return timeout + system_time();
282 }
283 
284 
285 static inline status_t
286 posix_error(status_t error)
287 {
288 	if (error == B_TIMED_OUT)
289 		return B_WOULD_BLOCK;
290 
291 	return error;
292 }
293 
294 
295 static inline bool
296 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
297 	uint32 receiveWindow)
298 {
299 	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
300 }
301 
302 
303 static inline bool
304 segment_in_sequence(const tcp_segment_header& segment, int size,
305 	const tcp_sequence& receiveNext, uint32 receiveWindow)
306 {
307 	tcp_sequence sequence(segment.sequence);
308 	if (size == 0) {
309 		if (receiveWindow == 0)
310 			return sequence == receiveNext;
311 		return in_window(sequence, receiveNext, receiveWindow);
312 	} else {
313 		if (receiveWindow == 0)
314 			return false;
315 		return in_window(sequence, receiveNext, receiveWindow)
316 			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
317 	}
318 }
319 
320 
321 static inline bool
322 is_writable(tcp_state state)
323 {
324 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED
325 		|| state == ESTABLISHED || state == FINISH_RECEIVED;
326 }
327 
328 
329 static inline uint32 tcp_now()
330 {
331 	return system_time() / kTimestampFactor;
332 }
333 
334 
335 static inline uint32 tcp_diff_timestamp(uint32 base)
336 {
337 	uint32 now = tcp_now();
338 
339 	if (now > base)
340 		return now - base;
341 
342 	return now + UINT_MAX - base;
343 }
344 
345 
346 static inline bool
347 state_needs_finish(int32 state)
348 {
349 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
350 		|| state == FINISH_SENT || state == CLOSING;
351 }
352 
353 
354 //	#pragma mark -
355 
356 
357 WaitList::WaitList(const char* name)
358 {
359 	fCondition = 0;
360 	fSem = create_sem(0, name);
361 }
362 
363 
/*!	Deletes the semaphore created by the constructor. */
WaitList::~WaitList()
{
	delete_sem(fSem);
}
368 
369 
370 status_t
371 WaitList::InitCheck() const
372 {
373 	return fSem;
374 }
375 
376 
/*!	Waits until the list is signalled, or until the absolute \a timeout
	expires, or the wait is interrupted.

	The mutex held by \a locker is released while waiting and is locked again
	before the method returns, regardless of the result.

	\return B_OK if the condition was consumed, otherwise the error returned
		by acquire_sem_etc().
*/
status_t
WaitList::Wait(MutexLocker& locker, bigtime_t timeout)
{
	locker.Unlock();

	status_t status = B_OK;

	// Try to consume the condition flag that Signal() sets (reset it from 1
	// to 0); as long as it is not set, block on the semaphore.
	// NOTE(review): relies on atomic_test_and_set() returning the previous
	// value - confirm against the kernel headers.
	while (!atomic_test_and_set(&fCondition, 0, 1)) {
		status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
			timeout);
		if (status != B_OK)
			break;
	}

	locker.Lock();
	return status;
}
394 
395 
/*!	Sets the condition flag and wakes up the threads blocking in Wait().
	Does not reschedule (B_DO_NOT_RESCHEDULE).
*/
void
WaitList::Signal()
{
	atomic_or(&fCondition, 1);
#ifdef __HAIKU__
	release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL);
#else
	// Without B_RELEASE_ALL: a negative semaphore count is taken as the
	// number of waiting threads, and the semaphore is released that often.
	int32 count;
	if (get_sem_count(fSem, &count) == B_OK && count < 0)
		release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE);
#endif
}
408 
409 
410 //	#pragma mark -
411 
412 
413 TCPEndpoint::TCPEndpoint(net_socket* socket)
414 	: ProtocolSocket(socket),
415 	fManager(NULL),
416 	fReceiveList("tcp receive"),
417 	fSendList("tcp send"),
418 	fOptions(0),
419 	fSendWindowShift(0),
420 	fReceiveWindowShift(0),
421 	fSendUnacknowledged(0),
422 	fSendNext(0),
423 	fSendMax(0),
424 	fSendUrgentOffset(0),
425 	fSendWindow(0),
426 	fSendMaxWindow(0),
427 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
428 	fSendQueue(socket->send.buffer_size),
429 	fInitialSendSequence(0),
430 	fDuplicateAcknowledgeCount(0),
431 	fRoute(NULL),
432 	fReceiveNext(0),
433 	fReceiveMaxAdvertised(0),
434 	fReceiveWindow(socket->receive.buffer_size),
435 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
436 	fReceiveQueue(socket->receive.buffer_size),
437 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
438 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
439 	fRetransmitTimeout(TCP_INITIAL_RTT),
440 	fReceivedTimestamp(0),
441 	fCongestionWindow(0),
442 	fSlowStartThreshold(0),
443 	fState(CLOSED),
444 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
445 {
446 	// TODO: to be replaced with a real read/write locking strategy!
447 	mutex_init(&fLock, "tcp lock");
448 
449 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
450 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
451 		this);
452 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
453 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
454 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
455 		this);
456 }
457 
458 
/*!	Tears down the endpoint: cancels all timers, unbinds the endpoint from
	its manager (if it has one) and releases the manager reference, destroys
	the lock, waits for the timer hooks to return, and finally puts the route.
*/
TCPEndpoint::~TCPEndpoint()
{
	// the lock is acquired here and destroyed below without being unlocked
	mutex_lock(&fLock);

	_CancelConnectionTimers();
	gStackModule->cancel_timer(&fTimeWaitTimer);

	if (fManager != NULL) {
		fManager->Unbind(this);
		put_endpoint_manager(fManager);
	}

	mutex_destroy(&fLock);

	// we need to wait for all timers to return
	gStackModule->wait_for_timer(&fRetransmitTimer);
	gStackModule->wait_for_timer(&fPersistTimer);
	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
	gStackModule->wait_for_timer(&fTimeWaitTimer);

	// NOTE(review): fRoute is NULL unless a route was set elsewhere;
	// put_route() is presumably fine with that - confirm.
	gDatalinkModule->put_route(Domain(), fRoute);
}
481 
482 
483 status_t
484 TCPEndpoint::InitCheck() const
485 {
486 	if (fReceiveList.InitCheck() < B_OK)
487 		return fReceiveList.InitCheck();
488 
489 	if (fSendList.InitCheck() < B_OK)
490 		return fSendList.InitCheck();
491 
492 	return B_OK;
493 }
494 
495 
496 //	#pragma mark - protocol API
497 
498 
499 status_t
500 TCPEndpoint::Open()
501 {
502 	TRACE("Open()");
503 
504 	status_t status = ProtocolSocket::Open();
505 	if (status < B_OK)
506 		return status;
507 
508 	fManager = get_endpoint_manager(Domain());
509 	if (fManager == NULL)
510 		return EAFNOSUPPORT;
511 
512 	return B_OK;
513 }
514 
515 
/*!	Closes the endpoint.

	A listening endpoint gets its accept semaphore deleted; endpoints in
	LISTEN or SYNCHRONIZE_SENT are moved to CLOSED right away. All other
	endpoints are disconnected via _Disconnect(); if SO_LINGER is set, the
	method then waits up to socket->linger seconds for the send queue to
	drain.

	\return B_OK on success (also if lingering timed out), otherwise the
		error from _Disconnect() or from waiting on the send list.
*/
status_t
TCPEndpoint::Close()
{
	TRACE("Close()");

	MutexLocker locker(fLock);

	if (fState == LISTEN)
		delete_sem(fAcceptSemaphore);

	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
		// TODO: what about linger in case of SYNCHRONIZE_SENT?
		fState = CLOSED;
		T(State(this));
		return B_OK;
	}

	status_t status = _Disconnect(true);
	if (status != B_OK)
		return status;

	if (socket->options & SO_LINGER) {
		TRACE("Close(): Lingering for %i secs", socket->linger);

		// the linger time is given in seconds
		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);

		while (fSendQueue.Used() > 0) {
			status = fSendList.Wait(locker, maximum);
			// running out of time is not an error, we just stop lingering
			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
				break;
			else if (status < B_OK)
				return status;
		}

		TRACE("Close(): after waiting, the SendQ was left with %lu bytes.",
			fSendQueue.Used());
	}
	return B_OK;
}
555 
556 
/*!	Called when the endpoint is freed.

	Nothing is done for endpoints whose state is SYNCHRONIZE_SENT or lower.
	Otherwise the time-wait handling is started, the endpoint is marked
	closed, and - unless it is flagged to be deleted on close - an extra
	socket reference is acquired to keep it alive until the time-wait timer
	expires.
*/
void
TCPEndpoint::Free()
{
	TRACE("Free()");

	MutexLocker _(fLock);

	if (fState <= SYNCHRONIZE_SENT)
		return;

	// we are only interested in the timer, not in changing state
	_EnterTimeWait();

	fFlags |= FLAG_CLOSED;
	if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
		// we'll be freed later when the 2MSL timer expires
		gSocketModule->acquire_socket(socket);
	}
}
576 
577 
578 /*!	Creates and sends a synchronize packet to /a address, and then waits
579 	until the connection has been established or refused.
580 */
581 status_t
582 TCPEndpoint::Connect(const sockaddr* address)
583 {
584 	TRACE("Connect() on address %s", PrintAddress(address));
585 
586 	MutexLocker locker(fLock);
587 
588 	if (gStackModule->is_restarted_syscall()) {
589 		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
590 		status_t status = _WaitForEstablished(locker, timeout);
591 		TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
592 			strerror(status), timeout);
593 		return posix_error(status);
594 	}
595 
596 	// Can only call connect() from CLOSED or LISTEN states
597 	// otherwise endpoint is considered already connected
598 	if (fState == LISTEN) {
599 		// this socket is about to connect; remove pending connections in the backlog
600 		gSocketModule->set_max_backlog(socket, 0);
601 	} else if (fState == ESTABLISHED) {
602 		return EISCONN;
603 	} else if (fState != CLOSED)
604 		return EINPROGRESS;
605 
606 	// TODO: this is IPv4 specific, and doesn't belong here!
607 	// consider destination address INADDR_ANY as INADDR_LOOPBACK
608 	sockaddr_in _address;
609 	if (((sockaddr_in*)address)->sin_addr.s_addr == INADDR_ANY) {
610 		memcpy(&_address, address, sizeof(sockaddr_in));
611 		_address.sin_len = sizeof(sockaddr_in);
612 		_address.sin_addr.s_addr = INADDR_LOOPBACK;
613 		address = (sockaddr*)&_address;
614 	}
615 
616 	status_t status = _PrepareSendPath(address);
617 	if (status < B_OK)
618 		return status;
619 
620 	TRACE("  Connect(): starting 3-way handshake...");
621 
622 	fState = SYNCHRONIZE_SENT;
623 	T(State(this));
624 
625 	// send SYN
626 	status = _SendQueued();
627 	if (status != B_OK) {
628 		_Close();
629 		return status;
630 	}
631 
632 	// If we are running over Loopback, after _SendQueued() returns we
633 	// may be in ESTABLISHED already.
634 	if (fState == ESTABLISHED) {
635 		TRACE("  Connect() completed after _SendQueued()");
636 		return B_OK;
637 	}
638 
639 	// wait until 3-way handshake is complete (if needed)
640 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
641 	if (timeout == 0) {
642 		// we're a non-blocking socket
643 		TRACE("  Connect() delayed, return EINPROGRESS");
644 		return EINPROGRESS;
645 	}
646 
647 	bigtime_t absoluteTimeout = absolute_timeout(timeout);
648 	gStackModule->store_syscall_restart_timeout(absoluteTimeout);
649 
650 	status = _WaitForEstablished(locker, absoluteTimeout);
651 	TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
652 		strerror(status), timeout);
653 	return posix_error(status);
654 }
655 
656 
/*!	Waits for a connected socket and returns it in \a _acceptedSocket.

	Blocks on the accept semaphore using the socket's receive timeout (or the
	timeout stored by the interrupted call, if the syscall was restarted).

	\return B_OK if a socket was dequeued, otherwise the error returned by
		acquire_sem_etc().
*/
status_t
TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
{
	TRACE("Accept()");

	MutexLocker locker(fLock);

	status_t status;
	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	do {
		// don't hold the endpoint lock while blocking on the semaphore
		locker.Unlock();

		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
			| B_CAN_INTERRUPT, timeout);
		if (status != B_OK)
			return status;

		locker.Lock();
		// if dequeuing fails, wait on the semaphore again
		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
#ifdef TRACE_TCP
		if (status == B_OK)
			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
#endif
	} while (status != B_OK);

	return status;
}
689 
690 
691 status_t
692 TCPEndpoint::Bind(const sockaddr *address)
693 {
694 	if (address == NULL)
695 		return B_BAD_VALUE;
696 
697 	MutexLocker lock(fLock);
698 
699 	TRACE("Bind() on address %s", PrintAddress(address));
700 
701 	if (fState != CLOSED)
702 		return EISCONN;
703 
704 	return fManager->Bind(this, address);
705 }
706 
707 
708 status_t
709 TCPEndpoint::Unbind(struct sockaddr *address)
710 {
711 	TRACE("Unbind()");
712 
713 	MutexLocker _(fLock);
714 	return fManager->Unbind(this);
715 }
716 
717 
718 status_t
719 TCPEndpoint::Listen(int count)
720 {
721 	TRACE("Listen()");
722 
723 	MutexLocker _(fLock);
724 
725 	if (fState != CLOSED)
726 		return B_BAD_VALUE;
727 
728 	fAcceptSemaphore = create_sem(0, "tcp accept");
729 	if (fAcceptSemaphore < B_OK)
730 		return ENOBUFS;
731 
732 	status_t status = fManager->SetPassive(this);
733 	if (status != B_OK) {
734 		delete_sem(fAcceptSemaphore);
735 		fAcceptSemaphore = -1;
736 		return status;
737 	}
738 
739 	gSocketModule->set_max_backlog(socket, count);
740 
741 	fState = LISTEN;
742 	T(State(this));
743 	return B_OK;
744 }
745 
746 
747 status_t
748 TCPEndpoint::Shutdown(int direction)
749 {
750 	TRACE("Shutdown(%i)", direction);
751 
752 	MutexLocker lock(fLock);
753 
754 	if (direction == SHUT_RD || direction == SHUT_RDWR)
755 		fFlags |= FLAG_NO_RECEIVE;
756 
757 	if (direction == SHUT_WR || direction == SHUT_RDWR) {
758 		// TODO: That's not correct. After read/write shutting down the socket
759 		// one should still be able to read previously arrived data.
760 		_Disconnect(false);
761 	}
762 
763 	return B_OK;
764 }
765 
766 
/*!	Puts data contained in \a buffer into send buffer

	Blocks (using the socket's send timeout) until the send queue has at
	least send.low_water_mark bytes free; if the buffer does not fit in one
	piece, it is split and queued in several parts. MSG_OOB updates the
	urgent offset and forces sending, MSG_EOF disconnects after queuing.

	\return B_OK on success, ENOTCONN if the endpoint is CLOSED,
		EDESTADDRREQ if it is listening, EPIPE if it is not writable (SIGPIPE
		is sent as well when called from a syscall), ENOBUFS or the trim()
		error if splitting fails, or the error from waiting for space
		(B_TIMED_OUT is reported as B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::SendData(net_buffer *buffer)
{
	MutexLocker lock(fLock);

	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
		  buffer, buffer->size, buffer->flags, fSendQueue.Size(),
		  fSendQueue.Free());

	if (fState == CLOSED)
		return ENOTCONN;
	if (fState == LISTEN)
		return EDESTADDRREQ;
	if (!is_writable(fState)) {
		// we only send signals when called from userland
		if (gStackModule->is_syscall())
			send_signal(find_thread(NULL), SIGPIPE);
		return EPIPE;
	}

	// remember the flags now, the buffer may be handed to the send queue
	// below
	uint32 flags = buffer->flags;
	size_t left = buffer->size;

	bigtime_t timeout = absolute_timeout(socket->send.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	while (left > 0) {
		while (fSendQueue.Free() < socket->send.low_water_mark) {
			// wait until enough space is available
			status_t status = fSendList.Wait(lock, timeout);
			if (status < B_OK) {
				TRACE("  SendData() returning %s (%d)",
					strerror(posix_error(status)), (int)posix_error(status));
				return posix_error(status);
			}

			// the lock was released while waiting, so the state may have
			// changed in the meantime
			if (!is_writable(fState)) {
				// we only send signals when called from userland
				if (gStackModule->is_syscall())
					send_signal(find_thread(NULL), SIGPIPE);
				return EPIPE;
			}
		}

		size_t size = fSendQueue.Free();
		if (size < left) {
			// we need to split the original buffer
			net_buffer* clone = gBufferModule->clone(buffer, false);
				// TODO: add offset/size parameter to net_buffer::clone() or
				// even a move_data() function, as this is a bit inefficient
			if (clone == NULL)
				return ENOBUFS;

			// the clone keeps the first "size" bytes...
			status_t status = gBufferModule->trim(clone, size);
			if (status != B_OK) {
				gBufferModule->free(clone);
				return status;
			}

			// ...which are then removed from the original buffer
			gBufferModule->remove_header(buffer, size);
			left -= size;
			fSendQueue.Add(clone);
		} else {
			// the (remaining) buffer fits completely
			left -= buffer->size;
			fSendQueue.Add(buffer);
		}
	}

	TRACE("  SendData(): %lu bytes used.", fSendQueue.Used());

	bool force = false;
	if ((flags & MSG_OOB) != 0) {
		fSendUrgentOffset = fSendQueue.LastSequence();
			// RFC 961 specifies that the urgent offset points to the last
			// byte of urgent data. However, this is commonly implemented as
			// here, ie. it points to the first byte after the urgent data.
		force = true;
	}
	if ((flags & MSG_EOF) != 0)
		_Disconnect(false);

	// in the other writable states the data just stays queued for now
	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
		_SendQueued(force);

	return B_OK;
}
857 
858 
859 ssize_t
860 TCPEndpoint::SendAvailable()
861 {
862 	MutexLocker locker(fLock);
863 
864 	ssize_t available;
865 
866 	if (is_writable(fState))
867 		available = fSendQueue.Free();
868 	else
869 		available = EPIPE;
870 
871 	TRACE("SendAvailable(): %li", available);
872 	return available;
873 }
874 
875 
876 status_t
877 TCPEndpoint::FillStat(net_stat *stat)
878 {
879 	MutexLocker _(fLock);
880 
881 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
882 	stat->receive_queue_size = fReceiveQueue.Available();
883 	stat->send_queue_size = fSendQueue.Used();
884 
885 	return B_OK;
886 }
887 
888 
/*!	Reads up to \a numBytes from the receive queue into \a _buffer.

	If the connection is still being established, the method first waits for
	that to complete (unless MSG_DONTWAIT is given). It then waits until
	enough data is available: the socket's receive low water mark, or
	\a numBytes with MSG_WAITALL; pushed data ends the wait early. With
	MSG_PEEK the data stays in the queue.

	\return the result of the receive queue's Get() if data was retrieved,
		B_OK (with \a _buffer set to NULL) if the connection is closing or
		receiving was shut down, ENOTCONN if the endpoint is CLOSED,
		B_WOULD_BLOCK if the call would have to wait but must not, or the
		error from waiting (B_TIMED_OUT is reported as B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
{
	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);

	MutexLocker locker(fLock);

	*_buffer = NULL;

	if (fState == CLOSED)
		return ENOTCONN;

	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	// the handshake is not complete yet
	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
		if (flags & MSG_DONTWAIT)
			return B_WOULD_BLOCK;

		status_t status = _WaitForEstablished(locker, timeout);
		if (status < B_OK)
			return posix_error(status);
	}

	size_t dataNeeded = socket->receive.low_water_mark;

	// When MSG_WAITALL is set then the function should block
	// until the full amount of data can be returned.
	if (flags & MSG_WAITALL)
		dataNeeded = numBytes;

	// TODO: add support for urgent data (MSG_OOB)

	while (true) {
		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
			|| fState == TIME_WAIT) {
			// ``Connection closing''.
			return B_OK;
		}

		if (fReceiveQueue.Available() > 0) {
			// enough data, or everything that is available has been pushed
			if (fReceiveQueue.Available() >= dataNeeded
				|| (fReceiveQueue.PushedData() > 0
					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
				break;
		} else if (fState == FINISH_RECEIVED) {
			// ``If no text is awaiting delivery, the RECEIVE will
			//   get a Connection closing''.
			return B_OK;
		}

		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
			return B_WOULD_BLOCK;

		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			return B_OK;

		status_t status = fReceiveList.Wait(locker, timeout);
		if (status < B_OK) {
			// The Open Group base specification mentions that EINTR should be
			// returned if the recv() is interrupted before _any data_ is
			// available. So we actually check if there is data, and if so,
			// push it to the user.
			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
				&& fReceiveQueue.Available() > 0)
				break;

			return posix_error(status);
		}
	}

	TRACE("  ReadData(): %lu are available.", fReceiveQueue.Available());

	// there will be data left after this read, signal the receive list again
	if (numBytes < fReceiveQueue.Available())
		fReceiveList.Signal();

	// MSG_PEEK: hand out the data without removing it from the queue
	bool clone = (flags & MSG_PEEK) != 0;

	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);

	TRACE("  ReadData(): %lu bytes kept.", fReceiveQueue.Available());

	// if we are opening the window, check if we should send an ACK
	if (!clone)
		SendAcknowledge(false);

	return receivedBytes;
}
980 
981 
982 ssize_t
983 TCPEndpoint::ReadAvailable()
984 {
985 	MutexLocker locker(fLock);
986 
987 	TRACE("ReadAvailable(): %li", _AvailableData());
988 
989 	return _AvailableData();
990 }
991 
992 
993 status_t
994 TCPEndpoint::SetSendBufferSize(size_t length)
995 {
996 	MutexLocker _(fLock);
997 	fSendQueue.SetMaxBytes(length);
998 	return B_OK;
999 }
1000 
1001 
1002 status_t
1003 TCPEndpoint::SetReceiveBufferSize(size_t length)
1004 {
1005 	MutexLocker _(fLock);
1006 	fReceiveQueue.SetMaxBytes(length);
1007 	return B_OK;
1008 }
1009 
1010 
1011 status_t
1012 TCPEndpoint::GetOption(int option, void* _value, int* _length)
1013 {
1014 	if (*_length != sizeof(int))
1015 		return B_BAD_VALUE;
1016 
1017 	int* value = (int*)_value;
1018 
1019 	switch (option) {
1020 		case TCP_NODELAY:
1021 			if ((fOptions & TCP_NODELAY) != 0)
1022 				*value = 1;
1023 			else
1024 				*value = 0;
1025 			return B_OK;
1026 
1027 		case TCP_MAXSEG:
1028 			*value = fReceiveMaxSegmentSize;
1029 			return B_OK;
1030 
1031 		default:
1032 			return B_BAD_VALUE;
1033 	}
1034 }
1035 
1036 
1037 status_t
1038 TCPEndpoint::SetOption(int option, const void* _value, int length)
1039 {
1040 	if (option != TCP_NODELAY)
1041 		return B_BAD_VALUE;
1042 
1043 	if (length != sizeof(int))
1044 		return B_BAD_VALUE;
1045 
1046 	const int* value = (const int*)_value;
1047 
1048 	MutexLocker _(fLock);
1049 	if (*value)
1050 		fOptions |= TCP_NODELAY;
1051 	else
1052 		fOptions &= ~TCP_NODELAY;
1053 
1054 	return B_OK;
1055 }
1056 
1057 
1058 //	#pragma mark - misc
1059 
1060 
1061 bool
1062 TCPEndpoint::IsBound() const
1063 {
1064 	return !LocalAddress().IsEmpty(true);
1065 }
1066 
1067 
1068 bool
1069 TCPEndpoint::IsLocal() const
1070 {
1071 	return (fFlags & FLAG_LOCAL) != 0;
1072 }
1073 
1074 
1075 status_t
1076 TCPEndpoint::DelayedAcknowledge()
1077 {
1078 	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
1079 		// timer was active, send an ACK now (with the exception above,
1080 		// we send every other ACK)
1081 		return SendAcknowledge(true);
1082 	}
1083 
1084 	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
1085 		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
1086 	return B_OK;
1087 }
1088 
1089 
/*!	Sends an acknowledgement by calling _SendQueued() with \a force and a
	second argument of 0, and returns its result.
*/
status_t
TCPEndpoint::SendAcknowledge(bool force)
{
	return _SendQueued(force, 0);
}
1095 
1096 
/*!	(Re)starts the persist timer with a fixed delay of 1000000.
	NOTE(review): presumably microseconds, ie. one second - confirm against
	the stack module's set_timer().
*/
void
TCPEndpoint::_StartPersistTimer()
{
	gStackModule->set_timer(&fPersistTimer, 1000000LL);
}
1102 
1103 
1104 void
1105 TCPEndpoint::_EnterTimeWait()
1106 {
1107 	TRACE("_EnterTimeWait()\n");
1108 
1109 	_CancelConnectionTimers();
1110 
1111 	if (fState == TIME_WAIT && IsLocal()) {
1112 		// we do not use TIME_WAIT state for local connections
1113 		fFlags |= FLAG_DELETE_ON_CLOSE;
1114 		return;
1115 	}
1116 
1117 	_UpdateTimeWait();
1118 }
1119 
1120 
/*!	(Re)starts the time-wait timer with twice the maximum segment lifetime
	(2MSL).
*/
void
TCPEndpoint::_UpdateTimeWait()
{
	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
}
1126 
1127 
/*!	Cancels the retransmit, persist, and delayed acknowledge timers. The
	time-wait timer is not touched.
*/
void
TCPEndpoint::_CancelConnectionTimers()
{
	gStackModule->cancel_timer(&fRetransmitTimer);
	gStackModule->cancel_timer(&fPersistTimer);
	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
}
1135 
1136 
1137 /*!	Sends the FIN flag to the peer when the connection is still open.
1138 	Moves the endpoint to the next state depending on where it was.
1139 */
1140 status_t
1141 TCPEndpoint::_Disconnect(bool closing)
1142 {
1143 	tcp_state previousState = fState;
1144 
1145 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1146 		fState = FINISH_SENT;
1147 	else if (fState == FINISH_RECEIVED)
1148 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1149 	else
1150 		return B_OK;
1151 
1152 	T(State(this));
1153 
1154 	status_t status = _SendQueued();
1155 	if (status != B_OK) {
1156 		fState = previousState;
1157 		T(State(this));
1158 		return status;
1159 	}
1160 
1161 	return B_OK;
1162 }
1163 
1164 
/*!	Moves the endpoint to the ESTABLISHED state. If the socket has a parent,
	it is marked connected and the accept semaphore is released once. In any
	case, the send wait list is signalled.
*/
void
TCPEndpoint::_MarkEstablished()
{
	fState = ESTABLISHED;
	T(State(this));

	if (gSocketModule->has_parent(socket)) {
		gSocketModule->set_connected(socket);
		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
	}

	// _WaitForEstablished() waits on the send list
	fSendList.Signal();
}
1178 
1179 
/*!	Waits on the send list until the endpoint's state is ESTABLISHED or
	beyond. The lock held by \a locker is released while waiting;
	\a timeout is passed on to WaitList::Wait().

	\return B_OK once the state is reached, the socket's error if one is set,
		or the error returned by the wait.
*/
status_t
TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
{
	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
	// When investigating this, also have a look at _Close() and _HandleReset().
	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
		// a pending socket error ends the wait
		if (socket->error != B_OK)
			return socket->error;

		status_t status = fSendList.Wait(locker, timeout);
		if (status < B_OK)
			return status;
	}

	return B_OK;
}
1196 
1197 
1198 //	#pragma mark - receive
1199 
1200 
/*!	Moves the endpoint to the CLOSED state: cancels the connection timers,
	flags the endpoint to be deleted on close, and wakes up waiting senders
	and readers. If the socket still has a parent, it is marked aborted.
*/
void
TCPEndpoint::_Close()
{
	_CancelConnectionTimers();
	fState = CLOSED;
	T(State(this));

	fFlags |= FLAG_DELETE_ON_CLOSE;

	fSendList.Signal();
	_NotifyReader();

	if (gSocketModule->has_parent(socket)) {
		// We still have a parent - obviously, we haven't been accepted yet,
		// so no one could ever close us.
		_CancelConnectionTimers();
			// the connection timers are cancelled a second time here
		gSocketModule->set_aborted(socket);
	}
}
1220 
1221 
1222 void
1223 TCPEndpoint::_HandleReset(status_t error)
1224 {
1225 	socket->error = error;
1226 	_Close();
1227 
1228 	gSocketModule->notify(socket, B_SELECT_WRITE, error);
1229 	gSocketModule->notify(socket, B_SELECT_ERROR, error);
1230 }
1231 
1232 
1233 void
1234 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1235 {
1236 	if (++fDuplicateAcknowledgeCount < 3)
1237 		return;
1238 
1239 	if (fDuplicateAcknowledgeCount == 3) {
1240 		_ResetSlowStart();
1241 		fCongestionWindow = fSlowStartThreshold + 3
1242 			* fSendMaxSegmentSize;
1243 		fSendNext = segment.acknowledge;
1244 	} else if (fDuplicateAcknowledgeCount > 3)
1245 		fCongestionWindow += fSendMaxSegmentSize;
1246 
1247 	_SendQueued();
1248 }
1249 
1250 
1251 void
1252 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1253 	size_t segmentLength)
1254 {
1255 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1256 		tcp_sequence sequence(segment.sequence);
1257 
1258 		if (fLastAcknowledgeSent >= sequence
1259 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1260 			fReceivedTimestamp = segment.timestamp_value;
1261 	}
1262 }
1263 
1264 
1265 ssize_t
1266 TCPEndpoint::_AvailableData() const
1267 {
1268 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1269 	//       the application of FLAG_NO_RECEIVE in listen()ing
1270 	//       sockets.
1271 	if (fState == LISTEN)
1272 		return gSocketModule->count_connected(socket);
1273 	if (fState == SYNCHRONIZE_SENT)
1274 		return 0;
1275 
1276 	ssize_t availableData = fReceiveQueue.Available();
1277 
1278 	if (availableData == 0 && !_ShouldReceive())
1279 		return ENOTCONN;
1280 
1281 	return availableData;
1282 }
1283 
1284 
1285 void
1286 TCPEndpoint::_NotifyReader()
1287 {
1288 	fReceiveList.Signal();
1289 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1290 }
1291 
1292 
1293 bool
1294 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1295 {
1296 	fReceiveQueue.Add(buffer, segment.sequence);
1297 	fReceiveNext = fReceiveQueue.NextSequence();
1298 
1299 	TRACE("  _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
1300 		(uint32)fReceiveNext, fReceiveQueue.Available());
1301 
1302 	if (segment.flags & TCP_FLAG_PUSH)
1303 		fReceiveQueue.SetPushPointer();
1304 
1305 	return fReceiveQueue.Available() > 0;
1306 }
1307 
1308 
1309 void
1310 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1311 {
1312 	fInitialReceiveSequence = segment.sequence;
1313 
1314 	// count the received SYN
1315 	segment.sequence++;
1316 
1317 	fReceiveNext = segment.sequence;
1318 	fReceiveQueue.SetInitialSequence(segment.sequence);
1319 
1320 	if ((fOptions & TCP_NOOPT) == 0) {
1321 		if (segment.max_segment_size > 0)
1322 			fSendMaxSegmentSize = segment.max_segment_size;
1323 
1324 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1325 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1326 			fSendWindowShift = segment.window_shift;
1327 		} else {
1328 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1329 			fReceiveWindowShift = 0;
1330 		}
1331 
1332 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1333 			fFlags |= FLAG_OPTION_TIMESTAMP;
1334 			fReceivedTimestamp = segment.timestamp_value;
1335 		} else
1336 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1337 	}
1338 
1339 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1340 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1341 }
1342 
1343 
1344 bool
1345 TCPEndpoint::_ShouldReceive() const
1346 {
1347 	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1348 		return false;
1349 
1350 	return fState == ESTABLISHED || fState == FINISH_SENT
1351 		|| fState == FINISH_ACKNOWLEDGED;
1352 }
1353 
1354 
1355 int32
1356 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1357 	net_buffer* buffer)
1358 {
1359 	MutexLocker _(fLock);
1360 
1361 	// TODO error checking
1362 	ProtocolSocket::Open();
1363 
1364 	fState = SYNCHRONIZE_RECEIVED;
1365 	T(Spawn(parent, this));
1366 
1367 	fManager = parent->fManager;
1368 
1369 	LocalAddress().SetTo(buffer->destination);
1370 	PeerAddress().SetTo(buffer->source);
1371 
1372 	TRACE("Spawn()");
1373 
1374 	// TODO: proper error handling!
1375 	if (fManager->BindChild(this) != B_OK) {
1376 		T(Error(this, "binding failed", __LINE__));
1377 		return DROP;
1378 	}
1379 	if (_PrepareSendPath(*PeerAddress()) != B_OK) {
1380 		T(Error(this, "prepare send faild", __LINE__));
1381 		return DROP;
1382 	}
1383 
1384 	fOptions = parent->fOptions;
1385 	fAcceptSemaphore = parent->fAcceptSemaphore;
1386 
1387 	_PrepareReceivePath(segment);
1388 
1389 	// send SYN+ACK
1390 	if (_SendQueued() != B_OK) {
1391 		T(Error(this, "sending failed", __LINE__));
1392 		return DROP;
1393 	}
1394 
1395 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1396 		// we handled this flag now, it must not be set for further processing
1397 
1398 	return _Receive(segment, buffer);
1399 }
1400 
1401 
1402 int32
1403 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1404 {
1405 	TRACE("ListenReceive()");
1406 
1407 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1408 	// but the error behaviour differs
1409 	if (segment.flags & TCP_FLAG_RESET)
1410 		return DROP;
1411 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1412 		return DROP | RESET;
1413 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1414 		return DROP;
1415 
1416 	// TODO: drop broadcast/multicast
1417 
1418 	// spawn new endpoint for accept()
1419 	net_socket* newSocket;
1420 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1421 		T(Error(this, "spawning failed", __LINE__));
1422 		return DROP;
1423 	}
1424 
1425 	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1426 		segment, buffer);
1427 }
1428 
1429 
1430 int32
1431 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1432 	net_buffer *buffer)
1433 {
1434 	TRACE("_SynchronizeSentReceive()");
1435 
1436 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1437 		&& (fInitialSendSequence >= segment.acknowledge
1438 			|| fSendMax < segment.acknowledge))
1439 		return DROP | RESET;
1440 
1441 	if (segment.flags & TCP_FLAG_RESET) {
1442 		_HandleReset(ECONNREFUSED);
1443 		return DROP;
1444 	}
1445 
1446 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1447 		return DROP;
1448 
1449 	fSendUnacknowledged = segment.acknowledge;
1450 	_PrepareReceivePath(segment);
1451 
1452 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1453 		_MarkEstablished();
1454 	} else {
1455 		// simultaneous open
1456 		fState = SYNCHRONIZE_RECEIVED;
1457 		T(State(this));
1458 	}
1459 
1460 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1461 		// we handled this flag now, it must not be set for further processing
1462 
1463 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1464 }
1465 
1466 
/*!	Processes an incoming segment in all states but LISTEN and
	SYNCHRONIZE_SENT (see the ASSERT below).

	A fast path handles in-order segments without control flags in the
	ESTABLISHED state first. Everything else goes through the full
	processing: sequence check, reset handling, trimming the data to the
	receive window, acknowledgement processing, queueing the data, and
	finally FIN handling.

	\param segment The header of the received segment. Its sequence and
		flags may be changed when the data is trimmed.
	\param buffer The buffer with the segment's data; it is trimmed in place.
	\return A combination of \c KEEP or \c DROP with \c ACKNOWLEDGE,
		\c IMMEDIATE_ACKNOWLEDGE, and \c RESET.
*/
1467 int32
1468 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1469 {
1470 	uint32 advertisedWindow = (uint32)segment.advertised_window
1471 		<< fSendWindowShift;
1472 	size_t segmentLength = buffer->size;
1473 
1474 	// First, handle the most common case for uni-directional data transfer
1475 	// (known as header prediction - the segment must not change the window,
1476 	// and must be the expected sequence, and contain no control flags)
1477 
1478 	if (fState == ESTABLISHED
1479 		&& segment.AcknowledgeOnly()
1480 		&& fReceiveNext == segment.sequence
1481 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1482 		&& fSendNext == fSendMax) {
1483 		_UpdateTimestamps(segment, segmentLength);
1484 
1485 		if (segmentLength == 0) {
1486 			// this is a pure acknowledge segment - we're on the sending end
1487 			if (fSendUnacknowledged < segment.acknowledge
1488 				&& fSendMax >= segment.acknowledge) {
1489 				_Acknowledged(segment);
1490 				return DROP;
1491 			}
1492 		} else if (segment.acknowledge == fSendUnacknowledged
1493 			&& fReceiveQueue.IsContiguous()
1494 			&& fReceiveQueue.Free() >= segmentLength
1495 			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
			// pure in-order data that acknowledges nothing new - we're on the
			// receiving end
1496 			if (_AddData(segment, buffer))
1497 				_NotifyReader();
1498 
1499 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1500 				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1501 		}
1502 	}
1503 
1504 	// The fast path was not applicable, so we continue with the standard
1505 	// processing of the incoming segment
1506 
1507 	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1508 
1509 	if (fState != CLOSED && fState != TIME_WAIT) {
1510 		// Check sequence number
1511 		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1512 				fReceiveWindow)) {
1513 			TRACE("  Receive(): segment out of window, next: %lu wnd: %lu",
1514 				(uint32)fReceiveNext, fReceiveWindow);
1515 			if (segment.flags & TCP_FLAG_RESET) {
1516 				// TODO: this doesn't look right - review!
1517 				return DROP;
1518 			}
1519 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1520 		}
1521 	}
1522 
1523 	if (segment.flags & TCP_FLAG_RESET) {
1524 		// Is this a valid reset?
1525 		// We generally ignore resets in time wait state (see RFC 1337)
1526 		if (fLastAcknowledgeSent <= segment.sequence
1527 			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1528 				+ fReceiveWindow)
1529 			&& fState != TIME_WAIT) {
			// the error reported to the socket depends on how far the
			// connection got
1530 			status_t error;
1531 			if (fState == SYNCHRONIZE_RECEIVED)
1532 				error = ECONNREFUSED;
1533 			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1534 				error = ENOTCONN;
1535 			else
1536 				error = ECONNRESET;
1537 
1538 			_HandleReset(error);
1539 		}
1540 
1541 		return DROP;
1542 	}
1543 
1544 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1545 		|| (fState == SYNCHRONIZE_RECEIVED
1546 			&& (fInitialReceiveSequence > segment.sequence
1547 				|| (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1548 					&& (fSendUnacknowledged > segment.acknowledge
1549 						|| fSendMax < segment.acknowledge)))) {
1550 		// reset the connection - either the initial SYN was faulty, or we
1551 		// received a SYN within the data stream
1552 		return DROP | RESET;
1553 	}
1554 
1555 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1556 		// the window must not shrink
1557 
1558 	// trim buffer to be within the receive window
1559 	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
1560 	if (drop > 0) {
		// If everything this segment carries has been received before
		// (including its FIN, if there is more to drop than there is data),
		// all of its data is removed, and its FIN is ignored.
1561 		if ((uint32)drop > buffer->size
1562 			|| ((uint32)drop == buffer->size
1563 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1564 			// don't accidently remove a FIN we shouldn't remove
1565 			segment.flags &= ~TCP_FLAG_FINISH;
1566 			drop = buffer->size;
1567 		}
1568 
1569 		// remove duplicate data at the start
1570 		TRACE("* remove %ld bytes from the start", drop);
1571 		gBufferModule->remove_header(buffer, drop);
1572 		segment.sequence += drop;
1573 	}
1574 
1575 	int32 action = KEEP;
1576 
	// from here on, "drop" is the number of bytes that exceed the end of
	// the receive window
1577 	drop = (int32)(segment.sequence + buffer->size
1578 		- (fReceiveNext + fReceiveWindow)).Number();
1579 	if (drop > 0) {
1580 		// remove data exceeding our window
1581 		if ((uint32)drop >= buffer->size) {
1582 			// if we can accept data, or the segment is not what we'd expect,
1583 			// drop the segment (an immediate acknowledge is always triggered)
1584 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1585 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1586 
1587 			action |= IMMEDIATE_ACKNOWLEDGE;
1588 		}
1589 
1590 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1591 			// we need to remove the finish, too, as part of the data
1592 			drop--;
1593 		}
1594 
1595 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1596 		TRACE("* remove %ld bytes from the end", drop);
1597 		gBufferModule->remove_trailer(buffer, drop);
1598 	}
1599 
1600 #ifdef TRACE_TCP
1601 	if (advertisedWindow > fSendWindow) {
1602 		TRACE("  Receive(): Window update %lu -> %lu", fSendWindow,
1603 			advertisedWindow);
1604 	}
1605 #endif
1606 
1607 	fSendWindow = advertisedWindow;
1608 	if (advertisedWindow > fSendMaxWindow)
1609 		fSendMaxWindow = advertisedWindow;
1610 
1611 	// Then look at the acknowledgement for any updates
1612 
1613 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1614 		// process acknowledged data
		// (in SYNCHRONIZE_RECEIVED, the range of the acknowledgement has
		// already been checked before the window trimming above)
1615 		if (fState == SYNCHRONIZE_RECEIVED)
1616 			_MarkEstablished();
1617 
		// the peer acknowledges something we have never sent
1618 		if (fSendMax < segment.acknowledge)
1619 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1620 
1621 		if (segment.acknowledge < fSendUnacknowledged) {
			// NOTE(review): fSendWindow has just been set to advertisedWindow
			// above, so the window comparison below is always true, and
			// cannot detect a window update anymore.
1622 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1623 				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1624 				TRACE("Receive(): duplicate ack!");
1625 
1626 				_DuplicateAcknowledge(segment);
1627 			}
1628 
1629 			return DROP;
1630 		} else {
1631 			// this segment acknowledges in flight data
1632 
1633 			if (fDuplicateAcknowledgeCount >= 3) {
1634 				// deflate the window.
1635 				fCongestionWindow = fSlowStartThreshold;
1636 			}
1637 
1638 			fDuplicateAcknowledgeCount = 0;
1639 
1640 			if (fSendMax == segment.acknowledge)
1641 				TRACE("Receive(): all inflight data ack'd!");
1642 
			// An acknowledgement beyond the last sequence of the send queue
			// can only acknowledge our FIN.
1643 			if (segment.acknowledge > fSendQueue.LastSequence()
1644 					&& fState > ESTABLISHED) {
1645 				TRACE("Receive(): FIN has been acknowledged!");
1646 
1647 				switch (fState) {
1648 					case FINISH_SENT:
1649 						fState = FINISH_ACKNOWLEDGED;
1650 						T(State(this));
1651 						break;
1652 					case CLOSING:
1653 						fState = TIME_WAIT;
1654 						T(State(this));
1655 						_EnterTimeWait();
1656 						return DROP;
1657 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1658 						_Close();
1659 						break;
1660 
1661 					default:
1662 						break;
1663 				}
1664 			}
1665 
1666 			if (fState != CLOSED)
1667 				_Acknowledged(segment);
1668 		}
1669 	}
1670 
1671 	if (segment.flags & TCP_FLAG_URGENT) {
1672 		if (fState == ESTABLISHED || fState == FINISH_SENT
1673 			|| fState == FINISH_ACKNOWLEDGED) {
1674 			// TODO: Handle urgent data:
1675 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1676 			//  - signal the user that urgent data is available (SIGURG)
1677 		}
1678 	}
1679 
1680 	bool notify = false;
1681 
1682 	if (buffer->size > 0 &&	_ShouldReceive())
1683 		notify = _AddData(segment, buffer);
1684 	else {
		// The data is not queued. With FLAG_NO_RECEIVE set, it is still
		// counted as received, though, so that it gets acknowledged.
1685 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1686 			fReceiveNext += buffer->size;
1687 
1688 		action = (action & ~KEEP) | DROP;
1689 	}
1690 
1691 	if (segment.flags & TCP_FLAG_FINISH) {
		// the FIN occupies one sequence number (this is used for the
		// timestamp update at the end)
1692 		segmentLength++;
1693 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1694 			TRACE("Receive(): peer is finishing connection!");
1695 			fReceiveNext++;
1696 			notify = true;
1697 
1698 			// FIN implies PUSH
1699 			fReceiveQueue.SetPushPointer();
1700 
1701 			// we'll reply immediately to the FIN if we are not
1702 			// transitioning to TIME WAIT so we immediatly ACK it.
1703 			action |= IMMEDIATE_ACKNOWLEDGE;
1704 
1705 			// other side is closing connection; change states
1706 			switch (fState) {
1707 				case ESTABLISHED:
1708 				case SYNCHRONIZE_RECEIVED:
1709 					fState = FINISH_RECEIVED;
1710 					T(State(this));
1711 					break;
1712 				case FINISH_SENT:
1713 					// simultaneous close
1714 					fState = CLOSING;
1715 					T(State(this));
1716 					break;
1717 				case FINISH_ACKNOWLEDGED:
1718 					fState = TIME_WAIT;
1719 					T(State(this));
1720 					_EnterTimeWait();
1721 					break;
1722 				case TIME_WAIT:
1723 					_UpdateTimeWait();
1724 					break;
1725 
1726 				default:
1727 					break;
1728 			}
1729 		}
1730 	}
1731 
1732 	if (notify)
1733 		_NotifyReader();
1734 
	// NOTE(review): segments with the SYN flag set have already been rejected
	// with DROP | RESET above, so only the size check can apply here.
1735 	if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1736 		action |= ACKNOWLEDGE;
1737 
1738 	_UpdateTimestamps(segment, segmentLength);
1739 
1740 	TRACE("Receive() Action %ld", action);
1741 
1742 	return action;
1743 }
1744 
1745 
1746 int32
1747 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1748 {
1749 	MutexLocker locker(fLock);
1750 
1751 	TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n"
1752 		"\tflags 0x%x, seq %lu, ack %lu, wnd %lu",
1753 		buffer, buffer->size, PrintAddress(buffer->source),
1754 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1755 		segment.acknowledge,
1756 		(uint32)segment.advertised_window << fSendWindowShift);
1757 	T(Receive(this, segment,
1758 		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1759 	int32 segmentAction = DROP;
1760 
1761 	switch (fState) {
1762 		case LISTEN:
1763 			segmentAction = _ListenReceive(segment, buffer);
1764 			break;
1765 
1766 		case SYNCHRONIZE_SENT:
1767 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1768 			break;
1769 
1770 		case SYNCHRONIZE_RECEIVED:
1771 		case ESTABLISHED:
1772 		case FINISH_RECEIVED:
1773 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1774 		case FINISH_SENT:
1775 		case FINISH_ACKNOWLEDGED:
1776 		case CLOSING:
1777 		case TIME_WAIT:
1778 		case CLOSED:
1779 			segmentAction = _Receive(segment, buffer);
1780 			break;
1781 	}
1782 
1783 	// process acknowledge action as asked for by the *Receive() method
1784 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1785 		SendAcknowledge(true);
1786 	else if (segmentAction & ACKNOWLEDGE)
1787 		DelayedAcknowledge();
1788 
1789 	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1790 			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1791 		locker.Unlock();
1792 		gSocketModule->release_socket(socket);
1793 	}
1794 
1795 	return segmentAction;
1796 }
1797 
1798 
1799 //	#pragma mark - send
1800 
1801 
1802 inline uint8
1803 TCPEndpoint::_CurrentFlags()
1804 {
1805 	// we don't set FLAG_FINISH here, instead we do it
1806 	// conditionally below depending if we are sending
1807 	// the last bytes of the send queue.
1808 
1809 	switch (fState) {
1810 		case CLOSED:
1811 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1812 
1813 		case SYNCHRONIZE_SENT:
1814 			return TCP_FLAG_SYNCHRONIZE;
1815 		case SYNCHRONIZE_RECEIVED:
1816 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1817 
1818 		case ESTABLISHED:
1819 		case FINISH_RECEIVED:
1820 		case FINISH_ACKNOWLEDGED:
1821 		case TIME_WAIT:
1822 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1823 		case FINISH_SENT:
1824 		case CLOSING:
1825 			return TCP_FLAG_ACKNOWLEDGE;
1826 
1827 		default:
1828 			return 0;
1829 	}
1830 }
1831 
1832 
1833 inline bool
1834 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
1835 	uint32 segmentMaxSize, uint32 flightSize)
1836 {
1837 	if (length > 0) {
1838 		// Avoid the silly window syndrome - we only send a segment in case:
1839 		// - we have a full segment to send, or
1840 		// - we're at the end of our buffer queue, or
1841 		// - the buffer is at least larger than half of the maximum send window,
1842 		//   or
1843 		// - we're retransmitting data
1844 		if (length == segmentMaxSize
1845 			|| (fOptions & TCP_NODELAY) != 0
1846 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1847 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1848 			return true;
1849 	}
1850 
1851 	// check if we need to send a window update to the peer
1852 	if (segment.advertised_window > 0) {
1853 		// correct the window to take into account what already has been advertised
1854 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1855 			- (fReceiveMaxAdvertised - fReceiveNext).Number();
1856 
1857 		// if we can advertise a window larger than twice the maximum segment
1858 		// size, or half the maximum buffer size we send a window update
1859 		if (window >= (fReceiveMaxSegmentSize << 1)
1860 			|| window >= (socket->receive.buffer_size >> 1))
1861 			return true;
1862 	}
1863 
1864 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1865 			| TCP_FLAG_RESET)) != 0)
1866 		return true;
1867 
1868 	// We do have urgent data pending
1869 	if (fSendUrgentOffset > fSendNext)
1870 		return true;
1871 
1872 	// there is no reason to send a segment just now
1873 	return false;
1874 }
1875 
1876 
1877 status_t
1878 TCPEndpoint::_SendQueued(bool force)
1879 {
1880 	return _SendQueued(force, fSendWindow);
1881 }
1882 
1883 
1884 /*!	Sends one or more TCP segments with the data waiting in the queue, or some
1885 	specific flags that need to be sent.

	\param force If \c true, the _ShouldSendSegment() check is skipped. If
		additionally no send window is left while there is still data in
		the queue, one byte is sent anyway (a window probe).
	\param sendWindow The send window to use. It is reduced to the
		congestion window if that one is set and smaller, and by what is
		already in flight before fSendNext.
	\return \c B_OK if everything went fine (including when nothing had to be
		sent), \c B_ERROR if there is no route or the endpoint is listening,
		\c B_NO_MEMORY if no buffer could be created, or the error returned
		by the send queue, add_tcp_header(), or the next protocol's
		send_routed_data().
1886 */
1887 status_t
1888 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
1889 {
1890 	if (fRoute == NULL)
1891 		return B_ERROR;
1892 
1893 	// in passive state?
1894 	if (fState == LISTEN)
1895 		return B_ERROR;
1896 
	// the flags depend on the current state only; FIN and PUSH are added
	// per segment in the loop below
1897 	tcp_segment_header segment(_CurrentFlags());
1898 
1899 	if ((fOptions & TCP_NOOPT) == 0) {
1900 		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
1901 			segment.options |= TCP_HAS_TIMESTAMPS;
1902 			segment.timestamp_reply = fReceivedTimestamp;
1903 			segment.timestamp_value = tcp_now();
1904 		}
1905 
1906 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1907 			&& fSendNext == fInitialSendSequence) {
1908 			// add connection establishment options
1909 			segment.max_segment_size = fReceiveMaxSegmentSize;
1910 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
1911 				segment.options |= TCP_HAS_WINDOW_SCALE;
1912 				segment.window_shift = fReceiveWindowShift;
1913 			}
1914 		}
1915 	}
1916 
	// advertise the free space of our receive queue as window
1917 	size_t availableBytes = fReceiveQueue.Free();
1918 	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
1919 		segment.advertised_window = availableBytes >> fReceiveWindowShift;
1920 	else
1921 		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
1922 
1923 	segment.acknowledge = fReceiveNext.Number();
1924 
1925 	// Process urgent data
1926 	if (fSendUrgentOffset > fSendNext) {
1927 		segment.flags |= TCP_FLAG_URGENT;
1928 		segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
1929 	} else {
1930 		fSendUrgentOffset = fSendUnacknowledged.Number();
1931 			// Keep urgent offset updated, so that it doesn't reach into our
1932 			// send window on overlap
1933 		segment.urgent_offset = 0;
1934 	}
1935 
	// a congestion window of 0 does not limit the send window
1936 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
1937 		sendWindow = fCongestionWindow;
1938 
1939 	// fSendUnacknowledged
1940 	//  |    fSendNext      fSendMax
1941 	//  |        |              |
1942 	//  v        v              v
1943 	//  -----------------------------------
1944 	//  | effective window           |
1945 	//  -----------------------------------
1946 
1947 	// Flight size represents the window of data which is currently in the
1948 	// ether. We should never send data such as the flight size becomes larger
1949 	// than the effective window. Note however that the effective window may be
1950 	// reduced (by congestion for instance), so at some point in time flight
1951 	// size may be larger than the currently calculated window.
1952 
	// NOTE(review): flightSize is only handed to _ShouldSendSegment(), which
	// does not evaluate it.
1953 	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1954 	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();
1955 
1956 	if (consumedWindow > sendWindow) {
1957 		sendWindow = 0;
1958 		// TODO enter persist state? try to get a window update.
1959 	} else
1960 		sendWindow -= consumedWindow;
1961 
1962 	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
1963 		// send one byte of data to ask for a window update
1964 		// (triggered by the persist timer)
1965 		sendWindow = 1;
1966 	}
1967 
	// "length" is the total number of data bytes this call is going to send
1968 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
1969 	tcp_sequence previousSendNext = fSendNext;
1970 
	// The loop runs at least once, so that segments without data (like a
	// plain acknowledgement, or a SYN) can be sent as well.
1971 	do {
1972 		uint32 segmentMaxSize = fSendMaxSegmentSize
1973 			- tcp_options_length(segment);
1974 		uint32 segmentLength = min_c(length, segmentMaxSize);
1975 
		// this segment reaches the end of the send queue
1976 		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
1977 			if (state_needs_finish(fState))
1978 				segment.flags |= TCP_FLAG_FINISH;
1979 			if (length > 0)
1980 				segment.flags |= TCP_FLAG_PUSH;
1981 		}
1982 
1983 		// Determine if we should really send this segment
1984 		if (!force && !_ShouldSendSegment(segment, segmentLength,
1985 			segmentMaxSize, flightSize)) {
			// Data is left in the queue, but nothing will trigger sending
			// it - let the persist timer take care of that.
1986 			if (fSendQueue.Available()
1987 				&& !gStackModule->is_timer_active(&fPersistTimer)
1988 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
1989 				_StartPersistTimer();
1990 			break;
1991 		}
1992 
1993 		net_buffer *buffer = gBufferModule->create(256);
1994 		if (buffer == NULL)
1995 			return B_NO_MEMORY;
1996 
1997 		status_t status = B_OK;
1998 		if (segmentLength > 0)
1999 			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
2000 		if (status < B_OK) {
2001 			gBufferModule->free(buffer);
2002 			return status;
2003 		}
2004 
2005 		LocalAddress().CopyTo(buffer->source);
2006 		PeerAddress().CopyTo(buffer->destination);
2007 
		// "size" is the sequence space this segment occupies - it has to be
		// retrieved before the TCP header is added to the buffer
2008 		uint32 size = buffer->size;
2009 		segment.sequence = fSendNext.Number();
2010 
2011 		TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n"
2012 			"\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu, ssthresh %lu\n"
2013 			"\tlen %lu first %lu last %lu",
2014 			buffer, buffer->size, PrintAddress(buffer->source),
2015 			PrintAddress(buffer->destination), segment.flags, segment.sequence,
2016 			segment.acknowledge, segment.advertised_window,
2017 			fCongestionWindow, fSlowStartThreshold, segmentLength,
2018 			(uint32)fSendQueue.FirstSequence(),
2019 			(uint32)fSendQueue.LastSequence());
2020 		T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
2021 			fSendQueue.LastSequence()));
2022 
		// NOTE(review): after this point, sendWindow is only read by the
		// PROBE() of the next iteration.
2023 		PROBE(buffer, sendWindow);
2024 		sendWindow -= buffer->size;
2025 
2026 		status = add_tcp_header(AddressModule(), segment, buffer);
2027 		if (status != B_OK) {
2028 			gBufferModule->free(buffer);
2029 			return status;
2030 		}
2031 
2032 		// Update send status - we need to do this before we send the data
2033 		// for local connections as the answer is directly handled
2034 
2035 		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
			// The header has been written already; this presumably keeps the
			// connection establishment options out of any further segment of
			// this loop. The SYN occupies one sequence number.
2036 			segment.options &= ~TCP_HAS_WINDOW_SCALE;
2037 			segment.max_segment_size = 0;
2038 			size++;
2039 		}
2040 
		// so does the FIN
2041 		if (segment.flags & TCP_FLAG_FINISH)
2042 			size++;
2043 
		// remember the old maximum to be able to undo the update below
2044 		uint32 sendMax = fSendMax.Number();
2045 		fSendNext += size;
2046 		if (fSendMax < fSendNext)
2047 			fSendMax = fSendNext;
2048 
2049 		fReceiveMaxAdvertised = fReceiveNext
2050 			+ ((uint32)segment.advertised_window << fReceiveWindowShift);
2051 
2052 		status = next->module->send_routed_data(next, fRoute, buffer);
2053 		if (status < B_OK) {
2054 			gBufferModule->free(buffer);
2055 
2056 			fSendNext = segment.sequence;
2057 			fSendMax = sendMax;
2058 				// restore send status
2059 			return status;
2060 		}
2061 
2062 		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
2063 			fLastAcknowledgeSent = segment.acknowledge;
2064 
		// these flags must only be sent once
2065 		length -= segmentLength;
2066 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
2067 			| TCP_FLAG_FINISH);
2068 	} while (length > 0);
2069 
2070 	// if we sent data from the beggining of the send queue,
2071 	// start the retransmition timer
2072 	if (previousSendNext == fSendUnacknowledged
2073 		&& fSendNext > previousSendNext) {
2074 		TRACE("  SendQueue(): set retransmit timer with rto %llu",
2075 			fRetransmitTimeout);
2076 
2077 		gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2078 	}
2079 
2080 	return B_OK;
2081 }
2082 
2083 
2084 int
2085 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2086 {
2087 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2088 }
2089 
2090 
2091 status_t
2092 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2093 {
2094 	if (fRoute == NULL) {
2095 		fRoute = gDatalinkModule->get_route(Domain(), peer);
2096 		if (fRoute == NULL)
2097 			return ENETUNREACH;
2098 
2099 		if ((fRoute->flags & RTF_LOCAL) != 0)
2100 			fFlags |= FLAG_LOCAL;
2101 	}
2102 
2103 	// make sure connection does not already exist
2104 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2105 		fRoute->interface->address);
2106 	if (status < B_OK)
2107 		return status;
2108 
2109 	fInitialSendSequence = system_time() >> 4;
2110 	fSendNext = fInitialSendSequence;
2111 	fSendUnacknowledged = fInitialSendSequence;
2112 	fSendMax = fInitialSendSequence;
2113 	fSendUrgentOffset = fInitialSendSequence;
2114 
2115 	// we are counting the SYN here
2116 	fSendQueue.SetInitialSequence(fSendNext + 1);
2117 
2118 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2119 
2120 	// Compute the window shift we advertise to our peer - if it doesn't support
2121 	// this option, this will be reset to 0 (when its SYN is received)
2122 	fReceiveWindowShift = 0;
2123 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2124 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2125 		fReceiveWindowShift++;
2126 	}
2127 
2128 	return B_OK;
2129 }
2130 
2131 
2132 void
2133 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2134 {
2135 	size_t previouslyUsed = fSendQueue.Used();
2136 
2137 	fSendQueue.RemoveUntil(segment.acknowledge);
2138 	fSendUnacknowledged = segment.acknowledge;
2139 
2140 	if (fSendNext < fSendUnacknowledged)
2141 		fSendNext = fSendUnacknowledged;
2142 
2143 	if (fSendUnacknowledged == fSendMax)
2144 		gStackModule->cancel_timer(&fRetransmitTimer);
2145 
2146 	if (fSendQueue.Used() < previouslyUsed) {
2147 		// this ACK acknowledged data
2148 
2149 		if (segment.options & TCP_HAS_TIMESTAMPS)
2150 			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply));
2151 		else {
2152 			// TODO: Fallback to RFC 793 type estimation
2153 		}
2154 
2155 		if (is_writable(fState)) {
2156 			// notify threads waiting on the socket to become writable again
2157 			fSendList.Signal();
2158 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used());
2159 		}
2160 
2161 		if (fCongestionWindow < fSlowStartThreshold)
2162 			fCongestionWindow += fSendMaxSegmentSize;
2163 	}
2164 
2165 	if (fCongestionWindow >= fSlowStartThreshold) {
2166 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2167 
2168 		if (increment < fCongestionWindow)
2169 			increment = 1;
2170 		else
2171 			increment /= fCongestionWindow;
2172 
2173 		fCongestionWindow += increment;
2174 	}
2175 
2176 	// if there is data left to be send, send it now
2177 	if (fSendQueue.Used() > 0)
2178 		_SendQueued();
2179 }
2180 
2181 
2182 void
2183 TCPEndpoint::_Retransmit()
2184 {
2185 	TRACE("Retransmit()");
2186 	_ResetSlowStart();
2187 	fSendNext = fSendUnacknowledged;
2188 	_SendQueued();
2189 }
2190 
2191 
2192 void
2193 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)
2194 {
2195 	int32 rtt = roundTripTime;
2196 
2197 	// "smooth" round trip time as per Van Jacobson
2198 	rtt -= fRoundTripTime / 8;
2199 	fRoundTripTime += rtt;
2200 	if (rtt < 0)
2201 		rtt = -rtt;
2202 	rtt -= fRoundTripDeviation / 4;
2203 	fRoundTripDeviation += rtt;
2204 
2205 	fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
2206 		* kTimestampFactor;
2207 
2208 	TRACE("  RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
2209 		roundTripTime);
2210 }
2211 
2212 
2213 void
2214 TCPEndpoint::_ResetSlowStart()
2215 {
2216 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2217 		2 * fSendMaxSegmentSize);
2218 	fCongestionWindow = fSendMaxSegmentSize;
2219 }
2220 
2221 
2222 //	#pragma mark - timer
2223 
2224 
2225 /*static*/ void
2226 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2227 {
2228 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2229 	T(Timer(endpoint, "retransmit"));
2230 
2231 	MutexLocker locker(endpoint->fLock);
2232 	if (!locker.IsLocked())
2233 		return;
2234 
2235 	endpoint->_Retransmit();
2236 }
2237 
2238 
2239 /*static*/ void
2240 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2241 {
2242 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2243 	T(Timer(endpoint, "persist"));
2244 
2245 	MutexLocker locker(endpoint->fLock);
2246 	if (!locker.IsLocked())
2247 		return;
2248 
2249 	// the timer might not have been canceled early enough
2250 	if (endpoint->State() == CLOSED)
2251 		return;
2252 
2253 	endpoint->_SendQueued(true);
2254 }
2255 
2256 
2257 /*static*/ void
2258 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2259 {
2260 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2261 	T(Timer(endpoint, "delayed ack"));
2262 
2263 	MutexLocker locker(endpoint->fLock);
2264 	if (!locker.IsLocked())
2265 		return;
2266 
2267 	// the timer might not have been canceled early enough
2268 	if (endpoint->State() == CLOSED)
2269 		return;
2270 
2271 	endpoint->SendAcknowledge(true);
2272 }
2273 
2274 
2275 /*static*/ void
2276 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2277 {
2278 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2279 	T(Timer(endpoint, "time-wait"));
2280 
2281 	MutexLocker locker(endpoint->fLock);
2282 	if (!locker.IsLocked())
2283 		return;
2284 
2285 	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2286 		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2287 		return;
2288 	}
2289 
2290 	locker.Unlock();
2291 
2292 	gSocketModule->release_socket(endpoint->socket);
2293 }
2294 
2295 
2296 //	#pragma mark -
2297 
2298 
2299 void
2300 TCPEndpoint::Dump() const
2301 {
2302 	kprintf("TCP endpoint %p\n", this);
2303 	kprintf("  state: %s\n", name_for_state(fState));
2304 	kprintf("  flags: 0x%lx\n", fFlags);
2305 #if KDEBUG
2306 	kprintf("  lock: { %p, holder: %ld }\n", &fLock, fLock.holder);
2307 #endif
2308 	kprintf("  accept sem: %ld\n", fAcceptSemaphore);
2309 	kprintf("  options: 0x%lx\n", (uint32)fOptions);
2310 	kprintf("  send\n");
2311 	kprintf("    window shift: %u\n", fSendWindowShift);
2312 	kprintf("    unacknowledged: %lu\n", fSendUnacknowledged.Number());
2313 	kprintf("    next: %lu\n", fSendNext.Number());
2314 	kprintf("    max: %lu\n", fSendMax.Number());
2315 	kprintf("    urgent offset: %lu\n", fSendUrgentOffset.Number());
2316 	kprintf("    window: %lu\n", fSendWindow);
2317 	kprintf("    max window: %lu\n", fSendMaxWindow);
2318 	kprintf("    max segment size: %lu\n", fSendMaxSegmentSize);
2319 	kprintf("    queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
2320 #if DEBUG_BUFFER_QUEUE
2321 	fSendQueue.Dump();
2322 #endif
2323 	kprintf("    last acknowledge sent: %lu\n", fLastAcknowledgeSent.Number());
2324 	kprintf("    initial sequence: %lu\n", fInitialSendSequence.Number());
2325 	kprintf("  receive\n");
2326 	kprintf("    window shift: %u\n", fReceiveWindowShift);
2327 	kprintf("    next: %lu\n", fReceiveNext.Number());
2328 	kprintf("    max advertised: %lu\n", fReceiveMaxAdvertised.Number());
2329 	kprintf("    window: %lu\n", fReceiveWindow);
2330 	kprintf("    max segment size: %lu\n", fReceiveMaxSegmentSize);
2331 	kprintf("    queue: %lu / %lu\n", fReceiveQueue.Available(),
2332 		fReceiveQueue.Size());
2333 #if DEBUG_BUFFER_QUEUE
2334 	fReceiveQueue.Dump();
2335 #endif
2336 	kprintf("    initial sequence: %lu\n", fInitialReceiveSequence.Number());
2337 	kprintf("    duplicate acknowledge count: %lu\n",
2338 		fDuplicateAcknowledgeCount);
2339 	kprintf("  round trip time: %ld (deviation %ld)\n", fRoundTripTime,
2340 		fRoundTripDeviation);
2341 	kprintf("  retransmit timeout: %lld\n", fRetransmitTimeout);
2342 	kprintf("  congestion window: %lu\n", fCongestionWindow);
2343 	kprintf("  slow start threshold: %lu\n", fSlowStartThreshold);
2344 }
2345 
2346